1 Setup

1.1 Packages

library(tidyverse)
library(cowplot)
library(GGally)
library(heatmaply)
library(sva)
library(limma)
library(biobroom)
library(ggridges)

1.2 Data

1.2.1 Metabolite Abundances

vpa.cell.neg.raw <- read_csv("./data/abundances/vpa_exp_hilic_target_negmode_abundance.csv")
Parsed with column specification:
cols(
  .default = col_double(),
  Samples = col_character(),
  Mode = col_character(),
  Type = col_character(),
  Group = col_character()
)
See spec(...) for full column specifications.
vpa.cell.pos.raw <- read_csv("./data/abundances/vpa_exp_hilic_target_posmode_abundance.csv")
Parsed with column specification:
cols(
  .default = col_double(),
  Samples = col_character(),
  Mode = col_character(),
  Type = col_character(),
  Group = col_character()
)
See spec(...) for full column specifications.

1.2.2 Compound Information

vpa.cell.neg.compound.info <- read_csv("./data/compound_info/vpa_exp_hilic_target_negmode_cmpnd_info.csv")
Parsed with column specification:
cols(
  compound_short = col_character(),
  compound_full = col_character(),
  formula = col_character(),
  mass = col_double(),
  rt = col_double(),
  cas_id = col_character()
)
vpa.cell.pos.compound.info <- read_csv("./data/compound_info/vpa_exp_hilic_target_posmode_cmpnd_info.csv")
Parsed with column specification:
cols(
  compound_short = col_character(),
  compound_full = col_character(),
  formula = col_character(),
  mass = col_double(),
  rt = col_double(),
  cas_id = col_character()
)

1.3 Functions

MissingPerSamplePlot <- function(raw.data, start.string) {
  # Counts the number of missing/NA values per sample and
  # percent compounds missing out of total number of compounds per sample
  # Then passes the result into a vertical bar plot, where each 
  # bar represents a single sample and the size of the bar 
  # is the % of compounds missing
  
  counted.na <- raw.data %>%
  select(starts_with(start.string)) %>% 
  mutate(
    count.na = apply(., 1, function(x) sum(is.na(x))),
    percent.na = (count.na / ncol(raw.data %>% select(starts_with(start.string)))) * 100
    ) %>%
  dplyr::select(count.na, percent.na) %>%
  bind_cols(
    raw.data %>% 
      select(Samples, Type)
      ) %>% 
  arrange(percent.na) %>% 
  mutate(f.order = factor(Samples, levels = Samples))
counted.na %>% 
  ggplot(aes(x = f.order, y = percent.na, fill = Type)) +
  geom_bar(stat = "identity") +
  geom_hline(yintercept = 20, color = "gray", size = 1, alpha = 0.8) +
  coord_flip()+
  xlab("Samples") +
  ylab("Percent missing values in sample") +
  theme(axis.text.y = element_text(size = 6)) 
}
MissingPerCompound <- function(raw.data, start.string){
  # Function to count in how many experimental samples each compound is missing
  # Also calculates the % missing:
  # (# NA per compound across all experimental samples) * 100 / (tot num of samples)
  
  raw.data %>% 
  filter(Type == "sample") %>% 
  select(Samples, starts_with(start.string)) %>% 
  gather(key = "Compound", value = "Abundance", -Samples) %>% 
  group_by(Compound) %>% 
  summarise(
    na_count = sum(is.na(Abundance)),
    n_samples = n(),
    percent_na = (na_count * 100 / n_samples)
    ) %>% 
  filter(na_count > 0) %>% 
  arrange(desc(percent_na))
}
ReplaceNAwMinLogTransformSingle <- function(raw.dataframe, start.prefix) {
  # Function to replace any NAs in each column with the minimum for that column, 
  # separately for each sample type.
  # NA in negative control samples are replaced by 2.
  # Then data is log2 transformed
  
  smpls <- raw.dataframe %>%
    filter(Type == "sample") %>% 
    dplyr::select(starts_with(start.prefix))
  smpls.min <- lapply(smpls, min, na.rm = TRUE)
  # replace the missing values in the real samples with the minimum of the samples
  # then take the log
  smpls.noNA <- raw.dataframe  %>%
    filter(Type== "sample") %>% 
    dplyr::select(Samples:Group) %>%
    bind_cols(
      smpls %>%
        replace_na(replace = smpls.min) %>% 
        log2()
      )
  # QC
  QC <- raw.dataframe %>%
    filter(Type == "mix") %>% 
    dplyr::select(starts_with(start.prefix))
  QC.min <- lapply(QC, min, na.rm = TRUE)
  # replace the missing values in the QC with the minimum of the QC
  # then take the log
  QC.noNA <- raw.dataframe  %>%
    filter(Type == "mix") %>% 
    dplyr::select(Samples:Group) %>%
    bind_cols(
      QC %>%
        replace_na(replace = QC.min) %>% 
        log2()
      )
  # replace the missing values in solv and empty samples with 2 - for PCA analysis
  # then take the log
  other.min <- setNames(
    as.list(
      rep(2, ncol(
        raw.dataframe %>% 
          dplyr::select(starts_with(start.prefix))))
      ),
    colnames(raw.dataframe %>% dplyr::select(starts_with(start.prefix)))
                )
  other.num.log <- raw.dataframe %>%
    filter(Type != "sample" & Type != "mix") %>% 
    dplyr::select(Samples:Group) %>%
    bind_cols(
      raw.dataframe %>% 
        filter(Type != "sample" & Type != "mix") %>% 
        dplyr::select(starts_with(start.prefix)) %>%
        replace_na(replace = other.min) %>% 
        log2()
      )
  # combine them together back into one data frame
  all.noNA <- smpls.noNA %>% 
    bind_rows(QC.noNA) %>% 
    bind_rows(other.num.log)
}
HeatmapPrepAlt <- function(raw.data, start.prefix){
  # function for preparing dara for heatmap viz
  x <- raw.data %>% 
    select(starts_with(start.prefix)) %>% 
    scale(center = TRUE, scale = TRUE) %>% 
    as.matrix() 
  row.names(x) <- raw.data$Samples
  return(x)
}

2 Data Exploration

2.1 Mass vs Retention Time

Q: What are the distributions of compound masses and retention times?

full.vpa.cmpnd <- vpa.cell.neg.compound.info %>% 
  mutate(Set = "Cells / Neg") %>% 
  bind_rows(vpa.cell.pos.compound.info %>% mutate(Set = "Cells / Pos"))
full.vpa.cmpnd %>% 
  ggplot(aes(x = rt, y = mass)) +
  geom_point(size = 3, alpha = 0.5) +
  xlab("Retention Time (min)") +
  ylab("Mass (Da)") +
  ggtitle("Mass v RT\nVPA-only HILIC Exp") +
  ylim(0, 1000)

full.vpa.cmpnd %>% 
  ggplot(aes(x = rt, y = mass, color = Set)) +
  geom_point(size = 3, alpha = 0.8) +
  xlab("Retnetion Time (min)") +
  ylab("Mass (Da)") +
  ggtitle("Mass v RT\nVPA-only HILIC Exp") +
  facet_wrap(~ Set) +
  ylim(0, 1000)

Q: Which compounds were found in one or more of the data types?

vpa.cell.cmpnd.join <- vpa.cell.neg.compound.info %>% 
  inner_join(vpa.cell.pos.compound.info, by = "cas_id", suffix = c(".c.n", ".c.p")) %>% 
  select(
    contains("cas_id"), contains("short"), 
    contains("full"), starts_with("formula"), 
    starts_with("mass"), starts_with("rt")
    )
# compound names - found in pos and neg mode / cells 
print(vpa.cell.cmpnd.join$compound_full.c.n)
 [1] "Alanine"                                          
 [2] "Beta-Alanine"                                     
 [3] "Sarcosine"                                        
 [4] "BAIBA"                                            
 [5] "Serine"                                           
 [6] "Hypotaurine"                                      
 [7] "Proline"                                          
 [8] "Valine"                                           
 [9] "Threonine"                                        
[10] "Taurine"                                          
[11] "Ketoleucine"                                      
[12] "5-Aminolevulinic Acid"                            
[13] "cis-4-Hydroxyproline"                             
[14] "Creatine"                                         
[15] "Leucine"                                          
[16] "Isoleucine"                                       
[17] "Asparagine"                                       
[18] "Aspartic Acid"                                    
[19] "Adenine"                                          
[20] "Glutamine"                                        
[21] "Lysine"                                           
[22] "Glutamic Acid"                                    
[23] "Methionine"                                       
[24] "Xanthine"                                         
[25] "Histidine"                                        
[26] "Allantoin"                                        
[27] "Phenylalanine"                                    
[28] "Pyridoxal"                                        
[29] "Pyridoxine"                                       
[30] "Glycerol 2-Phosphate"                             
[31] "Arginine"                                         
[32] "Tyrosine"                                         
[33] "D-Mannitol"                                       
[34] "D-Sorbitol"                                       
[35] "Phosphocholine"                                   
[36] "O-Phosphoserine"                                  
[37] "Tryptophan"                                       
[38] "Phosphocreatine"                                  
[39] "Pantothenic Acid"                                 
[40] "Cystathionine"                                    
[41] "Carnosine"                                        
[42] "Cytidine"                                         
[43] "Glycerol-3-phosphocholine"                        
[44] "D-Glucose 6-phosphate"                            
[45] "Thiamine (Vit B1)"                                
[46] "Inosine"                                          
[47] "5'-Methylthioadenosine"                           
[48] "N-Acetylaspartyl Glutamic Acid"                   
[49] "Glutathione (GSH)"                                
[50] "CMP"                                              
[51] "UMP"                                              
[52] "dAMP"                                             
[53] "3-Phosphoglyceroinositol"                         
[54] "AMP"                                              
[55] "CDP"                                              
[56] "UDP"                                              
[57] "ADP"                                              
[58] "UTP"                                              
[59] "CDP-Choline"                                      
[60] "dATP"                                             
[61] "ATP"                                              
[62] "GTP"                                              
[63] "Cyclic adenosine diphosphate ribose (cADP-ribose)"
[64] "ADP-Ribose"                                       
[65] "UDP-Galactose"                                    
[66] "ADP-Glucose"                                      
[67] "GDP-Glucose"                                      
[68] "UDP-N-Acetylgalactosamine"                        
[69] "GSSG"                                             
[70] "NAD"                                              
[71] "NADP"                                             
[72] "Coenzyme A (CoA)"                                 
[73] "Flavin adenine dinucleotide (FAD)"                
[74] "Acetyl-CoA"                                       
# percent of cell / neg compounds found in cell / pos 
round(nrow(vpa.cell.cmpnd.join) * 100 / nrow(vpa.cell.neg.compound.info), 1)
[1] 55.6
# percent of cell / neg compounds found in cell / pos 
round(nrow(vpa.cell.cmpnd.join) * 100 / nrow(vpa.cell.pos.compound.info), 1)
[1] 71.8
# any mass inconsistencies?
vpa.cell.cmpnd.join %>% 
  select(contains("mass")) %>% 
  ggpairs()

# any rt inconsistencies?
vpa.cell.cmpnd.join %>% 
  select(starts_with("rt")) %>% 
  ggpairs()

2.2 Missing Values

MissingPerSamplePlot(vpa.cell.neg.raw, "hVPAnC") +
  ggtitle("Missing Per Sample\nVPA-only HILIC / Cells / Neg Mode")

# remove neg P2C2
MissingPerSamplePlot(vpa.cell.pos.raw, "hVPApC") +
  ggtitle("Missing Per Sample\nVPA-only HILIC / Cells / Pos Mode")

vpa.cell.neg.raw <- vpa.cell.neg.raw %>% 
  filter(Samples != "P2C2")

Q: Are any of the compounds more than 20% missing in the experimental sample group? If there are any, they will be excluded from analysis.

(vpa.cell.neg.cmpnd.excl <- vpa.cell.neg.raw %>% 
  MissingPerCompound("hVPAnC") %>% 
  filter(percent_na > 20))
# A tibble: 1 x 4
  Compound na_count n_samples percent_na
  <chr>       <int>     <int>      <dbl>
1 hVPAnC86        4        10         40
vpa.cell.pos.raw %>% 
  MissingPerCompound("hVPApC") %>% 
  filter(percent_na > 20)
# A tibble: 0 x 4
# ... with 4 variables: Compound <chr>, na_count <int>, n_samples <int>,
#   percent_na <dbl>

2.3 Negative Controls and Compound Elimination

2.3.1 Cells / Negative Mode

vpa.cell.neg.raw.grp.mean <- vpa.cell.neg.raw %>% 
  group_by(Type) %>% 
  summarize_at(vars(matches("hVPAnC")), mean, na.rm = TRUE) %>% 
  gather(key = "Compound", value = "Type_mean_abun", -Type)
vpa.cell.neg.raw.grp.mean %>% 
  ggplot(aes(log2(Type_mean_abun), color = Type)) +
  geom_density(size = 2, alpha = 0.8) +
  ggtitle("Distribution of compound means\nVPA-only HILIC / Cells / Negative Mode\nGrouped by sample type")
Warning: Removed 51 rows containing non-finite values (stat_density).

vpa.cell.neg.raw.grp.mean.order <- vpa.cell.neg.raw.grp.mean %>% 
  filter(Type == "sample") %>% 
  arrange(Type_mean_abun)
vpa.cell.neg.raw %>% 
  select(Samples, Type, starts_with("hVPAnC")) %>% 
  gather("Compound", value = "Abundance", -c(Samples, Type)) %>% 
  mutate(Cmpnd_sort = factor(Compound, levels = vpa.cell.neg.raw.grp.mean.order$Compound)) %>% 
  ggplot(aes(Cmpnd_sort, log2(Abundance), color = Type, group = Samples)) + 
  geom_line(alpha = 0.2, size = 1) + 
  theme(axis.text.x = element_blank(), axis.ticks.x = element_blank()) +
  xlab("Compound") +
  geom_line(
    data = vpa.cell.neg.raw.grp.mean %>% 
      mutate(Cmpnd_sort = factor(Compound, levels = vpa.cell.neg.raw.grp.mean.order$Compound)), 
    aes(Cmpnd_sort, log2(Type_mean_abun), color = Type, group = Type),
    size = 0.5
    ) +
  ggtitle("Profile Plot of all compound abundances\nWith average per sample type overlaid\nVPA-only HILIC / Cells / Negative Mode")
Warning: Removed 22 rows containing missing values (geom_path).
Warning: Removed 9 rows containing missing values (geom_path).

vpa.cell.neg.raw.grp.mean %>% 
  mutate(Cmpnd_sort = factor(Compound, levels = vpa.cell.neg.raw.grp.mean.order$Compound)) %>% 
  ggplot(aes(Cmpnd_sort, log2(Type_mean_abun), color = Type, group = Type)) +
  geom_point(size = 1, alpha = 0.8) +
  geom_line(alpha = 0.8) +
  theme(axis.text.x = element_blank(), axis.ticks.x = element_blank()) +
  xlab("Compound") +
  ylab("log2(Sample Type Mean)") +
  ggtitle("Profile Plot of compound means by sample type only\nVPA-only HILIC / Cells / Negative Mode")
Warning: Removed 51 rows containing missing values (geom_point).

Warning: Removed 9 rows containing missing values (geom_path).

vpa.cell.neg.raw.grp.diff <- vpa.cell.neg.raw.grp.mean %>% 
  spread(Type, Type_mean_abun) %>% 
  mutate(smpl_solv_diff = sample / solv)
vpa.cell.neg.raw.grp.diff %>% 
  ggplot(aes(log2(smpl_solv_diff))) +
  geom_histogram(bins = 50)
Warning: Removed 51 rows containing non-finite values (stat_bin).

# include compounds with FC > 2.5 or FC is NA (indication of NA in solv)
vpa.cell.neg.cmpnd.to.incl <- vpa.cell.neg.raw.grp.diff %>% 
  filter(smpl_solv_diff > 2.5 | is.na(smpl_solv_diff)) %>% 
  filter(!(Compound %in% vpa.cell.neg.cmpnd.excl$Compound))
# original number of metabolites
nrow(vpa.cell.neg.raw.grp.diff)
[1] 133
# number after filtering
nrow(vpa.cell.neg.cmpnd.to.incl)
[1] 126

2.3.2 Cells / Positive Mode

vpa.cell.pos.raw.grp.mean <- vpa.cell.pos.raw %>% 
  group_by(Type) %>% 
  summarize_at(vars(matches("hVPApC")), mean, na.rm = TRUE) %>% 
  gather(key = "Compound", value = "Type_mean_abun", -Type)
vpa.cell.pos.raw.grp.mean %>% 
  ggplot(aes(log2(Type_mean_abun), color = Type)) +
  geom_density(size = 2, alpha = 0.8) +
  ggtitle("Distribution of compound means\nVPA-only HILIC / Cells / Positive Mode\nGrouped by sample type")
## Warning: Removed 28 rows containing non-finite values (stat_density).

vpa.cell.pos.raw.grp.mean.order <- vpa.cell.pos.raw.grp.mean %>% 
  filter(Type == "sample") %>% 
  arrange(Type_mean_abun)
vpa.cell.pos.raw %>% 
  select(Samples, Type, starts_with("hVPApC")) %>% 
  gather("Compound", value = "Abundance", -c(Samples, Type)) %>% 
  mutate(Cmpnd_sort = factor(Compound, levels = vpa.cell.pos.raw.grp.mean.order$Compound)) %>% 
  ggplot(aes(Cmpnd_sort, log2(Abundance), color = Type, group = Samples)) + 
  geom_line(alpha = 0.2, size = 1) + 
  theme(axis.text.x = element_blank(), axis.ticks.x = element_blank()) +
  xlab("Compound") +
  geom_line(
    data = vpa.cell.pos.raw.grp.mean %>% 
      mutate(Cmpnd_sort = factor(Compound, levels = vpa.cell.pos.raw.grp.mean.order$Compound)), 
    aes(Cmpnd_sort, log2(Type_mean_abun), color = Type, group = Type),
    size = 0.5
    ) +
  ggtitle("Profile Plot of all compound abundances\nWith average per sample type overlaid\nVPA-only HILIC / Cells / Positive Mode")
## Warning: Removed 25 rows containing missing values (geom_path).
## Warning: Removed 5 rows containing missing values (geom_path).

vpa.cell.pos.raw.grp.mean %>% 
  mutate(Cmpnd_sort = factor(Compound, levels = vpa.cell.pos.raw.grp.mean.order$Compound)) %>% 
  ggplot(aes(Cmpnd_sort, log2(Type_mean_abun), color = Type, group = Type)) +
  geom_point(size = 1, alpha = 0.8) +
  geom_line(alpha = 0.8) +
  theme(axis.text.x = element_blank(), axis.ticks.x = element_blank()) +
  xlab("Compound") +
  ylab("log2(Sample Type Mean)") +
  ggtitle("Profile Plot of compound means by sample type only\nVPA-only HILIC / Cells / Positive Mode")
## Warning: Removed 28 rows containing missing values (geom_point).

## Warning: Removed 5 rows containing missing values (geom_path).

vpa.cell.pos.raw.grp.diff <- vpa.cell.pos.raw.grp.mean %>% 
  spread(Type, Type_mean_abun) %>% 
  mutate(smpl_solv_diff = sample / solv)
vpa.cell.pos.raw.grp.diff %>% 
  ggplot(aes(log2(smpl_solv_diff))) +
  geom_histogram(bins = 50)
## Warning: Removed 28 rows containing non-finite values (stat_bin).

# include compounds with FC > 2.5 or FC is NA (indication of NA in solv)
vpa.cell.pos.cmpnd.to.incl <- vpa.cell.pos.raw.grp.diff %>% 
  filter(smpl_solv_diff > 2.5 | is.na(smpl_solv_diff))
# original number
nrow(vpa.cell.pos.raw.grp.diff)
## [1] 103
# number after filtering
nrow(vpa.cell.pos.cmpnd.to.incl)
## [1] 98

3 Data Prep and Preliminary Analysis

3.1 Cleanup

vpa.cell.neg.noNA <- vpa.cell.neg.raw %>% 
  select(Samples:Group, one_of(vpa.cell.neg.cmpnd.to.incl$Compound)) %>% 
  ReplaceNAwMinLogTransformSingle("hVPAnC")
vpa.cell.pos.noNA <- vpa.cell.pos.raw %>% 
  select(Samples:Group, one_of(vpa.cell.pos.cmpnd.to.incl$Compound)) %>% 
  ReplaceNAwMinLogTransformSingle("hVPApC")

3.2 Distribution plots

3.2.1 Cells / Negative Mode

vpa.cell.neg.noNA.gathered <- vpa.cell.neg.noNA %>% 
  gather(
    key = "Metabolite", "Abundance", 
    which(colnames(vpa.cell.neg.noNA) == "hVPAnC10"):ncol(vpa.cell.neg.noNA)
    )
vpa.cell.neg.noNA.gathered %>% 
  ggplot(aes(Samples, Abundance, fill = Type)) +
  geom_boxplot() +
  geom_boxplot(aes(color = Type), fatten = NULL, fill = NA, coef = 0, outlier.alpha = 0, show.legend = FALSE) +
  theme(axis.text.x = element_text(angle = 90)) +
  ylab("log2(Abundance)") +
  ggtitle("Boxplot of compound abundances\nAll samples\nVPA-only HILIC / Cells / Negative Mode")
Warning: Removed 1 rows containing missing values (geom_segment).

Warning: Removed 1 rows containing missing values (geom_segment).

Warning: Removed 1 rows containing missing values (geom_segment).

Warning: Removed 1 rows containing missing values (geom_segment).

Warning: Removed 1 rows containing missing values (geom_segment).

Warning: Removed 1 rows containing missing values (geom_segment).

Warning: Removed 1 rows containing missing values (geom_segment).

Warning: Removed 1 rows containing missing values (geom_segment).

Warning: Removed 1 rows containing missing values (geom_segment).

Warning: Removed 1 rows containing missing values (geom_segment).

Warning: Removed 1 rows containing missing values (geom_segment).

Warning: Removed 1 rows containing missing values (geom_segment).

Warning: Removed 1 rows containing missing values (geom_segment).

Warning: Removed 1 rows containing missing values (geom_segment).

Warning: Removed 1 rows containing missing values (geom_segment).

Warning: Removed 1 rows containing missing values (geom_segment).

# same data format, but as ridge plots
vpa.cell.neg.noNA.gathered %>% 
  ggplot(aes(y = Samples, x = Abundance, fill = Type)) + 
  geom_density_ridges(scale = 15) +
  theme_ridges() +
  scale_y_discrete(expand = c(0.01, 0)) +
  ggtitle("Ridge plot of compound abundances\nAll samples\nVPA-only HILIC / Cells / Negative Mode")
Picking joint bandwidth of 0.995

# experimental samples only
vpa.cell.neg.noNA.gathered %>% 
  filter(Type == "sample") %>% 
  ggplot(aes(y = Samples, x = Abundance, fill = Group)) + 
  geom_density_ridges(scale = 10) +
  theme_ridges() +
  scale_y_discrete(expand = c(0.01, 0)) +
  ggtitle("Ridge plot of compound abundances\nExperimental samples only\nVPA-only HILIC / Cells / Negative Mode")
Picking joint bandwidth of 0.932

# overlay the distributions for another look
vpa.cell.neg.noNA.gathered %>%
  filter(Type == "sample") %>% 
  ggplot(aes(Abundance, group = Samples, color = Group)) +
  geom_density(alpha = 0.8, size = 0.75) +
  xlab("log2(Abundance)") +
  ggtitle("Density plot of compound abundances\nExperimental samples only\nVPA-only HILIC / Cells / Negative Mode")

3.2.2 Cells / Positive Mode

vpa.cell.pos.noNA.gathered <- vpa.cell.pos.noNA %>% 
  gather(
    key = "Metabolite", "Abundance", 
    which(colnames(vpa.cell.pos.noNA) == "hVPApC1"):ncol(vpa.cell.pos.noNA)
    )
vpa.cell.pos.noNA.gathered %>% 
  ggplot(aes(Samples, Abundance, fill = Type)) +
  geom_boxplot() +
  geom_boxplot(aes(color = Type), fatten = NULL, fill = NA, coef = 0, outlier.alpha = 0, show.legend = FALSE) +
  theme(axis.text.x = element_text(angle = 90)) +
  ylab("log2(Abundance)") +
  ggtitle("Boxplot of compound abundances\nAll samples\nVPA-only HILIC / Cells / Positive Mode")
Warning: Removed 1 rows containing missing values (geom_segment).

Warning: Removed 1 rows containing missing values (geom_segment).

Warning: Removed 1 rows containing missing values (geom_segment).

Warning: Removed 1 rows containing missing values (geom_segment).

Warning: Removed 1 rows containing missing values (geom_segment).

Warning: Removed 1 rows containing missing values (geom_segment).

Warning: Removed 1 rows containing missing values (geom_segment).

Warning: Removed 1 rows containing missing values (geom_segment).

Warning: Removed 1 rows containing missing values (geom_segment).

Warning: Removed 1 rows containing missing values (geom_segment).

Warning: Removed 1 rows containing missing values (geom_segment).

Warning: Removed 1 rows containing missing values (geom_segment).

Warning: Removed 1 rows containing missing values (geom_segment).

Warning: Removed 1 rows containing missing values (geom_segment).

Warning: Removed 1 rows containing missing values (geom_segment).

Warning: Removed 1 rows containing missing values (geom_segment).

Warning: Removed 1 rows containing missing values (geom_segment).

Warning: Removed 1 rows containing missing values (geom_segment).

Warning: Removed 1 rows containing missing values (geom_segment).

vpa.cell.pos.noNA.gathered %>% 
  ggplot(aes(y = Samples, x = Abundance, fill = Type)) + 
  geom_density_ridges(scale = 15) +
  theme_ridges() +
  scale_y_discrete(expand = c(0.01, 0)) +
  ggtitle("Ridge plot of compound abundances\nAll samples\nVPA-only HILIC / Cells / Positive Mode")
Picking joint bandwidth of 1.3

vpa.cell.pos.noNA.gathered %>% 
  filter(Type == "sample") %>% 
  ggplot(aes(y = Samples, x = Abundance, fill = Group)) + 
  geom_density_ridges(scale = 10) +
  theme_ridges() +
  scale_y_discrete(expand = c(0.01, 0)) +
  ggtitle("Ridge plot of compound abundances\nExperimental samples only\nVPA-only HILIC / Cells / Positive Mode")
Picking joint bandwidth of 1.19

vpa.cell.pos.noNA.gathered %>%
  filter(Type == "sample") %>% 
  ggplot(aes(Abundance, group = Samples, color = Group)) +
  geom_density(alpha = 0.8, size = 0.75) +
  xlab("log2(Abundance)") +
  ggtitle("Density plot of compound abundances\nExperimental samples only\nVPA-only HILIC / Cells / Positive Mode")

3.3 Principal Component Analysis

Some plots to understand the relationship between the samples, QC samples, solvent, and empty samples in some cases.

3.3.1 Cells / Negative Mode

### PCA on all Samples ###
vpa.cell.neg.full.pca <- vpa.cell.neg.noNA %>% 
  select(starts_with("hVPAnC")) %>% 
  # good idea to center data before pca, but scaling should not be necessary
  mutate_all(scale, center = TRUE, scale = FALSE) %>% 
  as.matrix() %>% 
  prcomp()
# plot variance explained by each new principal component
plot(
  (vpa.cell.neg.full.pca$sdev ^ 2) * 100 / sum(vpa.cell.neg.full.pca$sdev ^ 2), 
  xlab = "Principal Component",
  ylab = "Variance Explained",
  main = "Percent variance explained by each principal component\nAll samples only\nVPA-only HILIC / Cells / Negative Mode",
  type = "b"
  )

vpa.cell.neg.full.pca.x <- as.data.frame(vpa.cell.neg.full.pca$x)
row.names(vpa.cell.neg.full.pca.x) <- vpa.cell.neg.noNA$Samples
vpa.cell.neg.full.pca.x <- vpa.cell.neg.full.pca.x %>% 
  bind_cols(vpa.cell.neg.noNA %>% select(Type:Group))
vpa.cell.neg.full.pca.x %>% 
  ggplot(aes(x = PC1, y = PC2, color = Type)) +
  geom_point(size = 4, alpha = 0.8) +
  xlab("PC1 (93.8% Var)") +
  ylab("PC2 (3.1%)") +
  ggtitle("Principal Component Analysis\nAll Samples\nVPA-only HILIC / Cells / Negative Mode")

### Samples and Mix ###
vpa.cell.neg.smpl.mix.pca <- vpa.cell.neg.noNA %>% 
  filter(Type == "sample" | Type == "mix") %>% 
  select(starts_with("hVPAnC")) %>% 
  mutate_all(scale, center = TRUE, scale = FALSE) %>% 
  as.matrix() %>% 
  prcomp()
plot(
  (vpa.cell.neg.smpl.mix.pca$sdev ^ 2) * 100 / sum(vpa.cell.neg.smpl.mix.pca$sdev ^ 2), 
  xlab = "Principal Component",
  ylab = "Variance Explained",
  main = "Percent variance explained by each principal component\nSamples and Mix\nVPA-only HILIC / Cells / Negative Mode",
  type = "b"
  )

vpa.cell.neg.smpl.mix.pca.x <- as.data.frame(vpa.cell.neg.smpl.mix.pca$x)
vpa.cell.neg.smpl.mix.pca.x <- vpa.cell.neg.smpl.mix.pca.x %>% 
  bind_cols(
    vpa.cell.neg.noNA %>% 
      filter(Type == "sample" | Type == "mix") %>% 
      select(Samples, Type:Group)
  )
row.names(vpa.cell.neg.smpl.mix.pca.x) <- vpa.cell.neg.smpl.mix.pca.x$Samples
vpa.cell.neg.smpl.mix.pca.x %>% 
  ggplot(aes(x = PC1, y = PC2, color = Group)) +
  geom_point(size = 4, alpha = 0.8) +
  xlab("PC1 (81.9% Var)") +
  ylab("PC2 (7.6%)") +
  ggtitle("Principal Component Analysis\nSamples and Mix\nVPA-only HILIC / Cells / Negative Mode")

### Experimental Samples Only ###
vpa.cell.neg.smpl.pca <- vpa.cell.neg.noNA %>% 
  filter(Type == "sample") %>% 
  select(starts_with("hVPAnC")) %>% 
  mutate_all(scale, center = TRUE, scale = FALSE) %>% 
  as.matrix() %>% 
  prcomp()
plot(
  (vpa.cell.neg.smpl.pca$sdev ^ 2) * 100 / sum(vpa.cell.neg.smpl.pca$sdev ^ 2), 
  xlab = "Principal Component",
  ylab = "Variance Explained",
  main = "Percent variance explained by each principal component\nExperimental samples only\nVPA-only HILIC / Cells / Negative Mode",
  type = "b"
  )

vpa.cell.neg.smpl.pca.x <- as.data.frame(vpa.cell.neg.smpl.pca$x)
vpa.cell.neg.smpl.pca.x <- vpa.cell.neg.smpl.pca.x %>% 
  bind_cols(
    vpa.cell.neg.noNA %>% 
      filter(Type == "sample") %>% 
      select(Samples, Type:Group)
  )
row.names(vpa.cell.neg.smpl.pca.x) <- vpa.cell.neg.smpl.pca.x$Samples
vpa.cell.neg.smpl.pca.x %>% 
  ggplot(aes(x = PC1, y = PC2, color = Group)) +
  geom_point(size = 4, alpha = 0.8) +
  xlab("PC1 (57.2% Var)") +
  ylab("PC2 (16.1%)") +
  ggtitle("Principal Component Analysis\nExperimental samples only\nVPA-only HILIC / Cells / Negative Mode")

vpa.cell.neg.smpl.pca.x %>% 
  ggplot(aes(x = PC3, y = PC4, color = Group)) +
  geom_point(size = 4, alpha = 0.8) +
  xlab("PC3 (11.2% Var)") +
  ylab("PC4 (6.1%)") +
  ggtitle("Principal Component Analysis\nExperimental samples only\nVPA-only HILIC / Cells / Negative Mode")

3.3.2 Cells / Positive Mode

### PCA on all Samples ###
vpa.cell.pos.full.pca <- vpa.cell.pos.noNA %>% 
  select(starts_with("hVPApC")) %>% 
  mutate_all(scale, center = TRUE, scale = FALSE) %>% 
  as.matrix() %>% 
  prcomp()
plot(
  (vpa.cell.pos.full.pca$sdev ^ 2) * 100 / sum(vpa.cell.pos.full.pca$sdev ^ 2), 
  xlab = "Principal Component",
  ylab = "Variance Explained",
  main = "Percent variance explained by each principal component\nAll samples only\nVPA-only HILIC / Cells / Positive Mode",
  type = "b"
  )

vpa.cell.pos.full.pca.x <- as.data.frame(vpa.cell.pos.full.pca$x)
row.names(vpa.cell.pos.full.pca.x) <- vpa.cell.pos.noNA$Samples
vpa.cell.pos.full.pca.x <- vpa.cell.pos.full.pca.x %>% 
  bind_cols(vpa.cell.pos.noNA %>% select(Type:Group))
vpa.cell.pos.full.pca.x %>% 
  ggplot(aes(x = PC1, y = PC2, color = Type)) +
  geom_point(size = 4, alpha = 0.8) +
  xlab("PC1 (89.1% Var)") +
  ylab("PC2 (6.2%)") +
  ggtitle("Principal Component Analysis\nAll Samples\nVPA-only HILIC / Cells / Positive Mode")

### Samples and Mix ###
vpa.cell.pos.smpl.mix.pca <- vpa.cell.pos.noNA %>% 
  filter(Type == "sample" | Type == "mix") %>% 
  select(starts_with("hVPApC")) %>% 
  mutate_all(scale, center = TRUE, scale = FALSE) %>% 
  as.matrix() %>% 
  prcomp()
plot(
  (vpa.cell.pos.smpl.mix.pca$sdev ^ 2) * 100 / sum(vpa.cell.pos.smpl.mix.pca$sdev ^ 2), 
  xlab = "Principal Component",
  ylab = "Variance Explained",
  main = "Percent variance explained by each principal component\nSamples and Mix\nVPA-only HILIC / Cells / Positive Mode",
  type = "b"
  )

vpa.cell.pos.smpl.mix.pca.x <- as.data.frame(vpa.cell.pos.smpl.mix.pca$x)
vpa.cell.pos.smpl.mix.pca.x <- vpa.cell.pos.smpl.mix.pca.x %>% 
  bind_cols(
    vpa.cell.pos.noNA %>% 
      filter(Type == "sample" | Type == "mix") %>% 
      select(Samples, Type:Group)
  )
row.names(vpa.cell.pos.smpl.mix.pca.x) <- vpa.cell.pos.smpl.mix.pca.x$Samples
vpa.cell.pos.smpl.mix.pca.x %>% 
  ggplot(aes(x = PC1, y = PC2, color = Group)) +
  geom_point(size = 4, alpha = 0.8) +
  xlab("PC1 (86.2% Var)") +
  ylab("PC2 (4.7%)") +
  ggtitle("Principal Component Analysis\nSamples and Mix\nVPA-only HILIC / Cells / Positive Mode")

### Experimental Samples Only ###
vpa.cell.pos.smpl.pca <- vpa.cell.pos.noNA %>% 
  filter(Type == "sample") %>% 
  select(starts_with("hVPApC")) %>% 
  mutate_all(scale, center = TRUE, scale = FALSE) %>% 
  as.matrix() %>% 
  prcomp()
plot(
  (vpa.cell.pos.smpl.pca$sdev ^ 2) * 100 / sum(vpa.cell.pos.smpl.pca$sdev ^ 2), 
  xlab = "Principal Component",
  ylab = "Variance Explained",
  main = "Percent variance explained by each principal component\nExperimental samples only\nVPA-only HILIC / Cells / Positive Mode",
  type = "b"
  )

vpa.cell.pos.smpl.pca.x <- as.data.frame(vpa.cell.pos.smpl.pca$x)
vpa.cell.pos.smpl.pca.x <- vpa.cell.pos.smpl.pca.x %>% 
  bind_cols(
    vpa.cell.pos.noNA %>% 
      filter(Type == "sample") %>% 
      select(Samples, Type:Group)
  )
row.names(vpa.cell.pos.smpl.pca.x) <- vpa.cell.pos.smpl.pca.x$Samples
vpa.cell.pos.smpl.pca.x %>% 
  ggplot(aes(x = PC1, y = PC2, color = Group)) +
  geom_point(size = 4, alpha = 0.8) +
  xlab("PC1 (38.8% Var)") +
  ylab("PC2 (29.0%)") +
  ggtitle("Principal Component Analysis\nExperimental samples only\nVPA-only HILIC / Cells / Positive Mode")

vpa.cell.pos.smpl.pca.x %>% 
  ggplot(aes(x = PC3, y = PC4, color = Group)) +
  geom_point(size = 4, alpha = 0.8) +
  xlab("PC3 (15.2% Var)") +
  ylab("PC4 (6.8%)") +
  ggtitle("Principal Component Analysis\nExperimental samples only\nVPA-only HILIC / Cells / Positive Mode")

4 Statistical analysis

4.1 Cells / Negative Mode

# sample prep
vpa.cell.neg.smpl.data <- vpa.cell.neg.noNA %>% 
    filter(Type == "sample")
vpa.cell.neg.data.for.sva <- as.matrix(
  vpa.cell.neg.smpl.data[, which(
    colnames(vpa.cell.neg.smpl.data) == "hVPAnC10"
    ):ncol(vpa.cell.neg.smpl.data)]
  )
row.names(vpa.cell.neg.data.for.sva) <- vpa.cell.neg.smpl.data$Samples
vpa.cell.neg.data.for.sva <- t(vpa.cell.neg.data.for.sva)
# pheno prep
vpa.cell.neg.data.pheno <- as.data.frame(vpa.cell.neg.smpl.data[, 3:4])
row.names(vpa.cell.neg.data.pheno) <- vpa.cell.neg.smpl.data$Samples
# sva calculation
vpa.cell.neg.mod.vpa <- model.matrix(~ as.factor(Group), data = vpa.cell.neg.data.pheno)
vpa.cell.neg.mod0 <- model.matrix(~ 1, data = vpa.cell.neg.data.pheno)
set.seed(2018)
num.sv(vpa.cell.neg.data.for.sva, vpa.cell.neg.mod.vpa, method = "be")
[1] 1
set.seed(2018)
num.sv(vpa.cell.neg.data.for.sva, vpa.cell.neg.mod.vpa, method = "leek")
[1] 0
set.seed(2018)
vpa.cell.neg.sv <- sva(vpa.cell.neg.data.for.sva, vpa.cell.neg.mod.vpa, vpa.cell.neg.mod0)
Number of significant surrogate variables is:  1 
Iteration (out of 5 ):1  2  3  4  5  
# extract the surrogate variables
vpa.cell.neg.surr.var <- as.data.frame(vpa.cell.neg.sv$sv)
colnames(vpa.cell.neg.surr.var) <- c("S1")

vpa.cell.neg.noNA %>% 
  filter(Type == "sample") %>% 
  select(Samples, Group) %>% 
  bind_cols(
    vpa.cell.neg.surr.var
    ) %>% 
  ggplot(aes(Group, S1, color = Group)) +
  geom_boxplot() +
  geom_jitter(size = 2, width = 0.1)

vpa.cell.neg.noNA %>% 
  filter(Type == "sample") %>% 
  select(Samples, Group) %>% 
  bind_cols(
    vpa.cell.neg.surr.var
    ) %>% 
  ggplot(aes(S1, fill = Group)) +
  geom_histogram(position = "dodge")
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

#P2C3

colnames(vpa.cell.neg.mod.vpa) <- c("cntrl", "VPAvsCNTRL")
# combine the full model matrix and the surrogate variables into one
vpa.cell.neg.d.sv <- cbind(vpa.cell.neg.mod.vpa, vpa.cell.neg.surr.var)  
head(vpa.cell.neg.d.sv)
     cntrl VPAvsCNTRL         S1
P1C1     1          0 -0.2664254
P1C2     1          0 -0.2341810
P1C3     1          0  0.2084435
P2C1     1          0 -0.2479613
P2C3     1          0  0.7338837
P1V1     1          1 -0.3380811
vpa.cell.neg.top.table <- vpa.cell.neg.data.for.sva %>% 
  lmFit(vpa.cell.neg.d.sv) %>% 
  # calculate the test statistics
  eBayes() %>% 
  # select the top features that have a p-value < 0.05 after Bonferroni multiple hypothesis correction
  topTable(coef = "VPAvsCNTRL", adjust = "bonferroni", p.value = 0.05, n = nrow(vpa.cell.neg.data.for.sva))
vpa.cell.neg.top.w.info <- vpa.cell.neg.top.table %>% 
  rownames_to_column("compound_short") %>% 
  mutate(
    vpa_div_cntrl = 2 ^ logFC,
    change_in_vpa = ifelse(vpa_div_cntrl < 1, "down", "up")
    ) %>% 
  inner_join(vpa.cell.neg.compound.info, by = "compound_short") %>% 
  filter(vpa_div_cntrl > 1.2 | vpa_div_cntrl < 0.83) %>%  
  arrange(change_in_vpa, desc(vpa_div_cntrl))
vpa.cell.neg.top.w.info %>% 
  select(compound_short, compound_full, change_in_vpa, vpa_div_cntrl)
   compound_short                   compound_full change_in_vpa
1        hVPAnC44                         Ribitol          down
2        hVPAnC11                   Glyceric Acid          down
3        hVPAnC68                      D-Sorbitol          down
4       hVPAnC118                     CDP-Choline            up
5        hVPAnC35                   Caprylic Acid            up
6        hVPAnC97  N-Acetylaspartyl Glutamic Acid            up
7        hVPAnC31                   Aspartic Acid            up
8       hVPAnC103 Docosahexaenoic Acid (22:6 n-3)            up
9        hVPAnC14                         Proline            up
10       hVPAnC99               Glutathione (GSH)            up
11       hVPAnC40                   Glutamic Acid            up
12        hVPAnC8                            GABA            up
13       hVPAnC80                   Cystathionine            up
   vpa_div_cntrl
1      0.7542940
2      0.6495347
3      0.3008038
4      2.9249716
5      2.6685912
6      1.8002089
7      1.7014029
8      1.6790242
9      1.6132250
10     1.4222172
11     1.2917915
12     1.2897935
13     1.2391877
#write_csv(path = "./results/vpa_hilic_cell_neg_top_hits_w_FC_pval.csv", vpa.cell.neg.top.w.info)
vpa.cell.neg.gathered <- vpa.cell.neg.noNA %>%
  filter(Type == "sample") %>% 
  bind_cols(vpa.cell.neg.surr.var) %>% 
  select(Samples, Group, S1, starts_with("hVPAnC")) %>% 
  gather(key = "Compound", value = "Abundance", hVPAnC10:hVPAnC99)
vpa.cell.neg.nested <- vpa.cell.neg.gathered %>% 
  group_by(Compound) %>% 
  nest() %>%
  mutate(model = map(data, ~lm(Abundance ~ S1, data = .))) %>% 
  mutate(augment_model = map(model, augment))
vpa.cell.neg.modSV.resid <- vpa.cell.neg.nested %>% 
  unnest(data, augment_model) %>% 
  select(Samples, Group, Compound, .resid) %>% 
  spread(Compound, .resid) 
vpa.cell.neg.modSV.resid %>% 
  select(Samples, one_of(vpa.cell.neg.top.w.info$compound_short)) %>% 
  HeatmapPrepAlt("hVPAnC") %>% 
  t() %>% 
  heatmaply(
    colors = viridis(n = 10, option = "magma"), 
    xlab = "Samples", ylab = "Compounds",
    main = "Statistically significant compounds\nVPA-Only HILIC / Cells / Negative Mode",
    margins = c(50, 50, 75, 30),
    labRow = vpa.cell.neg.top.w.info$compound_full,
    k_col = 2, k_row = 2
    )
### PCA - cleaned data ###
vpa.cell.neg.modSV.pca <- vpa.cell.neg.modSV.resid %>% 
  select(starts_with("hVPAnC")) %>% 
  mutate_all(scale, center = TRUE, scale = FALSE) %>% 
  as.matrix() %>% 
  prcomp()
vpa.cell.neg.modSV.pca.x <- as.data.frame(vpa.cell.neg.modSV.pca$x)
row.names(vpa.cell.neg.modSV.pca.x) <- vpa.cell.neg.modSV.resid$Samples
vpa.cell.neg.modSV.pca.x <- vpa.cell.neg.modSV.pca.x %>% 
  bind_cols(vpa.cell.neg.modSV.resid %>% select(Group))
vpa.cell.neg.modSV.pca.x %>% 
  ggplot(aes(x = PC1, y = PC2, color = Group)) +
  geom_point(size = 4, alpha = 0.8) +
  xlab("PC1 (38.6% Var)") +
  ylab("PC2 (25.9% Var)")

vpa.cell.neg.modSV.pca.x %>% 
  ggplot(aes(x = PC3, y = PC4, color = Group)) +
  geom_point(size = 4, alpha = 0.8) +
  xlab("PC3 (13.8% Var)") +
  ylab("PC4 (6.8% Var)")

4.2 Cells / Positive Mode

# sample prep
vpa.cell.pos.smpl.data <- vpa.cell.pos.noNA %>% 
    filter(Type == "sample")
vpa.cell.pos.data.for.sva <- as.matrix(
  vpa.cell.pos.smpl.data[, which(
    colnames(vpa.cell.pos.smpl.data) == "hVPApC1"
    ):ncol(vpa.cell.pos.smpl.data)]
  )
row.names(vpa.cell.pos.data.for.sva) <- vpa.cell.pos.smpl.data$Samples
vpa.cell.pos.data.for.sva <- t(vpa.cell.pos.data.for.sva)
vpa.cell.pos.data.pheno <- as.data.frame(vpa.cell.pos.smpl.data[, 3:4])
row.names(vpa.cell.pos.data.pheno) <- vpa.cell.pos.smpl.data$Samples
vpa.cell.pos.mod.vpa <- model.matrix(~ as.factor(Group), data = vpa.cell.pos.data.pheno)
vpa.cell.pos.mod0 <- model.matrix(~ 1, data = vpa.cell.pos.data.pheno)
set.seed(2018)
num.sv(vpa.cell.pos.data.for.sva, vpa.cell.pos.mod.vpa, method = "be")
[1] 2
set.seed(2018)
num.sv(vpa.cell.pos.data.for.sva, vpa.cell.pos.mod.vpa, method = "leek")
[1] 0
set.seed(2018)
vpa.cell.pos.sv <- sva(vpa.cell.pos.data.for.sva, vpa.cell.pos.mod.vpa, vpa.cell.pos.mod0)
Number of significant surrogate variables is:  2 
Iteration (out of 5 ):1  2  3  4  5  
vpa.cell.pos.surr.var <- as.data.frame(vpa.cell.pos.sv$sv)
colnames(vpa.cell.pos.surr.var) <- c("S1", "S2")


vpa.cell.pos.noNA %>% 
  filter(Type == "sample") %>% 
  select(Samples, Group) %>% 
  bind_cols(
    vpa.cell.pos.surr.var
    ) %>% 
  gather("surr_var", "value", S1:S2) %>% 
  ggplot(aes(Group, value, color = Group)) +
  geom_boxplot() +
  geom_jitter(size = 2, width = 0.1) +
  facet_wrap(~ surr_var)

vpa.cell.pos.covar <- vpa.cell.pos.smpl.pca.x %>% 
  select(Samples:Group, PC1:PC4) %>%
  bind_cols(vpa.cell.pos.surr.var)


colnames(vpa.cell.pos.mod.vpa) <- c("cntrl", "VPAvsCNTRL")
vpa.cell.pos.d.sv <- cbind(vpa.cell.pos.mod.vpa, vpa.cell.pos.surr.var)  
vpa.cell.pos.top.table <- vpa.cell.pos.data.for.sva %>% 
  lmFit(vpa.cell.pos.d.sv) %>% 
  eBayes() %>% 
  topTable(coef = "VPAvsCNTRL", adjust = "bonferroni", p.value = 0.05, n = nrow(vpa.cell.pos.data.for.sva))
vpa.cell.pos.top.w.info <- vpa.cell.pos.top.table %>% 
  rownames_to_column("compound_short") %>% 
  mutate(
    vpa_div_cntrl = 2 ^ logFC,
    change_in_vpa = ifelse(vpa_div_cntrl < 1, "down", "up")
    ) %>% 
  inner_join(vpa.cell.pos.compound.info, by = "compound_short") %>% 
  filter(vpa_div_cntrl > 1.2 | vpa_div_cntrl < 0.83) %>%  
  arrange(change_in_vpa, desc(vpa_div_cntrl))
vpa.cell.pos.top.w.info %>% 
  select(compound_short, compound_full, change_in_vpa, vpa_div_cntrl)
   compound_short                  compound_full change_in_vpa
1        hVPApC78    Nicotinamide Mononucleotide          down
2        hVPApC65      Glycerol-3-phosphocholine          down
3        hVPApC94                    ADP-Glucose            up
4        hVPApC55                O-Phosphoserine            up
5        hVPApC87                    CDP-Choline            up
6        hVPApC73 N-Acetylaspartyl Glutamic Acid            up
7        hVPApC13                        Proline            up
8        hVPApC29                  Aspartic Acid            up
9        hVPApC97                           GSSG            up
10       hVPApC44                Methyl-L-Lysine            up
11       hVPApC37                  Glutamic Acid            up
   vpa_div_cntrl
1      0.5161583
2      0.3167073
3      3.0205769
4      2.2837832
5      2.2692412
6      1.8012584
7      1.7208174
8      1.6606775
9      1.3542636
10     1.3304318
11     1.2362496
#write_csv(path = "./results/vpa_hilic_cell_pos_top_hits_w_FC_pval.csv", vpa.cell.pos.top.w.info)
vpa.cell.pos.gathered <- vpa.cell.pos.noNA %>%
  filter(Type == "sample") %>% 
  bind_cols(vpa.cell.pos.surr.var) %>% 
  select(Samples, Group, S1:S2, starts_with("hVPApC")) %>% 
  gather(key = "Compound", value = "Abundance", hVPApC1:hVPApC99)
vpa.cell.pos.nested <- vpa.cell.pos.gathered %>% 
  group_by(Compound) %>% 
  nest() %>%
  mutate(model = map(data, ~lm(Abundance ~ S1 + S2, data = .))) %>% 
  mutate(augment_model = map(model, augment))
vpa.cell.pos.modSV.resid <- vpa.cell.pos.nested %>% 
  unnest(data, augment_model) %>% 
  select(Samples, Group, Compound, .resid) %>% 
  spread(Compound, .resid) 
vpa.cell.pos.modSV.resid %>% 
  select(Samples, one_of(vpa.cell.pos.top.w.info$compound_short)) %>% 
  HeatmapPrepAlt("hVPApC") %>% 
  t() %>% 
  heatmaply(
    colors = viridis(n = 10, option = "magma"), 
    xlab = "Samples", ylab = "Compounds",
    main = "Statistically significant compounds\nVPA-Only HILIC / Cells / Positive Mode",
    margins = c(50, 50, 75, 30),
    labRow = vpa.cell.pos.top.w.info$compound_full,
    k_col = 2, k_row = 2
    )
### PCA - cleaned data ###
vpa.cell.pos.modSV.pca <- vpa.cell.pos.modSV.resid %>% 
  select(starts_with("hVPApC")) %>% 
  mutate_all(scale, center = TRUE, scale = FALSE) %>% 
  as.matrix() %>% 
  prcomp()
vpa.cell.pos.modSV.pca.x <- as.data.frame(vpa.cell.pos.modSV.pca$x)
row.names(vpa.cell.pos.modSV.pca.x) <- vpa.cell.pos.modSV.resid$Samples
vpa.cell.pos.modSV.pca.x <- vpa.cell.pos.modSV.pca.x %>% 
  bind_cols(vpa.cell.pos.modSV.resid %>% select(Group))
vpa.cell.pos.modSV.pca.x %>% 
  ggplot(aes(x = PC1, y = PC2, color = Group)) +
  geom_point(size = 4, alpha = 0.8) +
  xlab("PC1 (47.1% Var)") +
  ylab("PC2 (21.3% Var)")

vpa.cell.pos.modSV.pca.x %>% 
  ggplot(aes(x = PC3, y = PC4, color = Group)) +
  geom_point(size = 4, alpha = 0.8) +
  xlab("PC3 (12.9% Var)") +
  ylab("PC4 (7.6% Var)")

4.3 Write to csv

(not evaluated)

#write_csv(vpa.cell.neg.modSV.resid, path = "./results/modsv_resid/vpa_hilic_cell_neg_modSV_resid.csv")
#write_csv(vpa.cell.pos.modSV.resid, path = "./results/modSV_resid/vpa_hilic_cell_pos_modSV_resid.csv")

4.4 Profile plots

### Neg Mode ###
vpa.cell.neg.resid.for.profile <- vpa.cell.neg.modSV.resid %>% 
  select(Samples:Group, one_of(vpa.cell.neg.top.w.info$compound_short)) %>% 
  gather("Compound", value = "Abundance", -c(Samples, Group))
vpa.cell.neg.resid.for.profile %>% 
  group_by(Samples) %>% 
  count() %>% 
  filter(n != 13)
# A tibble: 0 x 2
# Groups:   Samples [0]
# ... with 2 variables: Samples <chr>, n <int>
vpa.cell.neg.resid.order <- vpa.cell.neg.resid.for.profile %>% 
  group_by(Compound, Group) %>% 
  summarize(avg.abun = mean(Abundance)) %>% 
  ungroup() %>% 
  spread(key = "Group", value = "avg.abun") %>% 
  mutate(FC = vpa - cntrl) %>% 
  arrange(FC) %>% 
  mutate(compound_sort = factor(Compound)) %>% 
  inner_join(vpa.cell.neg.top.w.info, by = c("Compound" = "compound_short"))
vpa.cell.neg.modSV.resid.prof.plot <- vpa.cell.neg.resid.for.profile %>% 
  mutate(compound_order = factor(Compound, levels = vpa.cell.neg.resid.order$compound_sort)) %>% 
  ggplot(aes(compound_order, Abundance, color = Group, group = Samples)) + 
  geom_line(alpha = 0.4, size = 1.25) + 
  theme_minimal() +
  theme(
    axis.ticks.x = element_blank(),
    legend.title = element_blank(),
    axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1)
    ) +
  scale_x_discrete(name = "Compound", labels = vpa.cell.neg.resid.order$compound_full) +
  scale_color_manual(values = c("#56B4E9","#E69F00"), labels = c("Control", "VPA")) +
  ylab("log2(Compound Resid)") +
  # overlay the group averages 
  geom_line(
    data = vpa.cell.neg.resid.order %>% gather("Treatment", "treat_mean", cntrl:vpa),
    aes(compound_sort, treat_mean, color = Treatment, group = Treatment),
    size = 2
    ) +
  theme(plot.margin = margin(5, 5, 5, 40))
vpa.cell.neg.modSV.resid.prof.plot

### Pos Mode ###
vpa.cell.pos.resid.for.profile <- vpa.cell.pos.modSV.resid %>% 
  select(Samples:Group, one_of(vpa.cell.pos.top.w.info$compound_short)) %>% 
  gather("Compound", value = "Abundance", -c(Samples, Group))
vpa.cell.pos.resid.for.profile %>% 
  group_by(Samples) %>% 
  count() %>% 
  filter(n != 11)
# A tibble: 0 x 2
# Groups:   Samples [0]
# ... with 2 variables: Samples <chr>, n <int>
vpa.cell.pos.resid.order <- vpa.cell.pos.resid.for.profile %>% 
  group_by(Compound, Group) %>% 
  summarize(avg.abun = mean(Abundance)) %>% 
  ungroup() %>% 
  spread(key = "Group", value = "avg.abun") %>% 
  mutate(FC = vpa - cntrl) %>% 
  arrange(FC) %>% 
  mutate(compound_sort = factor(Compound)) %>% 
  inner_join(vpa.cell.pos.top.w.info, by = c("Compound" = "compound_short"))
vpa.cell.pos.modSV.resid.prof.plot <- vpa.cell.pos.resid.for.profile %>% 
  mutate(compound_order = factor(Compound, levels = vpa.cell.pos.resid.order$compound_sort)) %>% 
  ggplot(aes(compound_order, Abundance, color = Group, group = Samples)) + 
  geom_line(alpha = 0.4, size = 1.25) + 
  theme_minimal() +
  theme(
    axis.ticks.x = element_blank(),
    legend.title = element_blank(),
    axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1)
    
    ) +
  scale_x_discrete(name = "Compound", labels = vpa.cell.pos.resid.order$compound_full) +
  scale_color_manual(values = c("#56B4E9","#E69F00"), labels = c("Control", "VPA")) +
  ylab("log2(Compound Resid)") +
  # overlay the group averages 
  geom_line(
    data = vpa.cell.pos.resid.order %>% gather("Treatment", "treat_mean", cntrl:vpa),
    aes(compound_sort, treat_mean, color = Treatment, group = Treatment),
    size = 2
    ) +
  theme(plot.margin = margin(5, 5, 5, 40))
vpa.cell.pos.modSV.resid.prof.plot

vpa.hilic.resid.grid <- plot_grid(vpa.cell.neg.modSV.resid.prof.plot, vpa.cell.pos.modSV.resid.prof.plot, ncol = 1 , labels = c("A", "B"))
#save_plot(filename = "./results/vpa_hilic_sig_resid_plots.png", vpa.hilic.resid.grid, base_width = 6, base_height = 8)

5 Session Info

sessionInfo()
R version 3.5.3 (2019-03-11)
Platform: x86_64-w64-mingw32/x64 (64-bit)
Running under: Windows 7 x64 (build 7601) Service Pack 1

Matrix products: default

locale:
[1] LC_COLLATE=English_United States.1252 
[2] LC_CTYPE=English_United States.1252   
[3] LC_MONETARY=English_United States.1252
[4] LC_NUMERIC=C                          
[5] LC_TIME=English_United States.1252    

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
 [1] ggridges_0.5.1      biobroom_1.14.0     broom_0.5.1        
 [4] limma_3.38.3        sva_3.30.0          BiocParallel_1.16.2
 [7] genefilter_1.64.0   mgcv_1.8-28         nlme_3.1-137       
[10] heatmaply_0.15.2    viridis_0.5.1       viridisLite_0.3.0  
[13] plotly_4.8.0        GGally_1.4.0        cowplot_0.9.4      
[16] forcats_0.4.0       stringr_1.4.0       dplyr_0.8.0.1      
[19] purrr_0.3.2         readr_1.3.1         tidyr_0.8.3        
[22] tibble_2.1.1        ggplot2_3.1.0       tidyverse_1.2.1    

loaded via a namespace (and not attached):
  [1] colorspace_1.4-1     class_7.3-15         modeltools_0.2-22   
  [4] mclust_5.4.3         rstudioapi_0.10      flexmix_2.3-15      
  [7] bit64_0.9-7          fansi_0.4.0          AnnotationDbi_1.44.0
 [10] mvtnorm_1.0-10       lubridate_1.7.4      xml2_1.2.0          
 [13] codetools_0.2-16     splines_3.5.3        robustbase_0.93-4   
 [16] knitr_1.22           jsonlite_1.6         annotate_1.60.0     
 [19] cluster_2.0.7-1      kernlab_0.9-27       shiny_1.2.0         
 [22] compiler_3.5.3       httr_1.4.0           backports_1.1.3     
 [25] assertthat_0.2.1     Matrix_1.2-16        lazyeval_0.2.2      
 [28] cli_1.1.0            later_0.8.0          htmltools_0.3.6     
 [31] tools_3.5.3          gtable_0.2.0         glue_1.3.1          
 [34] reshape2_1.4.3       Rcpp_1.0.1           Biobase_2.42.0      
 [37] cellranger_1.1.0     trimcluster_0.1-2.1  gdata_2.18.0        
 [40] crosstalk_1.0.0      iterators_1.0.10     fpc_2.1-11.1        
 [43] xfun_0.5             rvest_0.3.2          mime_0.6            
 [46] gtools_3.8.1         XML_3.98-1.19        dendextend_1.10.0   
 [49] DEoptimR_1.0-8       MASS_7.3-51.1        scales_1.0.0        
 [52] TSP_1.1-6            promises_1.0.1       hms_0.4.2           
 [55] parallel_3.5.3       RColorBrewer_1.1-2   yaml_2.2.0          
 [58] memoise_1.1.0        gridExtra_2.3        reshape_0.8.8       
 [61] stringi_1.4.3        RSQLite_2.1.1        gclus_1.3.2         
 [64] S4Vectors_0.20.1     foreach_1.4.4        seriation_1.2-3     
 [67] caTools_1.17.1.2     BiocGenerics_0.28.0  matrixStats_0.54.0  
 [70] rlang_0.3.2          pkgconfig_2.0.2      prabclus_2.2-7      
 [73] bitops_1.0-6         evaluate_0.13        lattice_0.20-38     
 [76] labeling_0.3         htmlwidgets_1.3      bit_1.1-14          
 [79] tidyselect_0.2.5     plyr_1.8.4           magrittr_1.5        
 [82] R6_2.4.0             IRanges_2.16.0       gplots_3.0.1.1      
 [85] generics_0.0.2       DBI_1.0.0            pillar_1.3.1        
 [88] haven_2.1.0          whisker_0.3-2        withr_2.1.2         
 [91] survival_2.43-3      RCurl_1.95-4.12      nnet_7.3-12         
 [94] modelr_0.1.4         crayon_1.3.4         utf8_1.1.4          
 [97] KernSmooth_2.23-15   rmarkdown_1.12       grid_3.5.3          
[100] readxl_1.3.1         data.table_1.12.0    blob_1.1.1          
[103] digest_0.6.18        diptest_0.75-7       webshot_0.5.1       
[106] xtable_1.8-3         httpuv_1.5.0         stats4_3.5.3        
[109] munsell_0.5.0        registry_0.5-1